##  Organize data for '53 paper
rm(list = ls())
library(readxl)
library(Hmisc)

setwd("~/Dropbox/Current projects/1953/Replication archive")
dat <- data.frame(read_excel("gdr municipalities with RIAS and NWDR NEW.xlsx", 1))


##  choose signal strength measure to use
signal <- "tirem_3"
dat$signal <- dat[, signal]


##  import train distance data and compute train distances
##  trains2 and hubs2 restricted to hubs of at least size category 6
trains1 <- data.frame(read_excel("munisknot1_converted.xls",1))
trains2 <- data.frame(read_excel("munisknot2_converted.xls",1))
hubs1 <- data.frame(read_excel("knotenbahnhofentfernungen.xlsx",1))
hubs2 <- data.frame(read_excel("knotenbahnhofentfernungen 2.xlsx",1))

dat$traindist1 <- NULL
dat$traindist2 <- NULL

m <- 1
for(m in 1:nrow(dat))	{

	dat$traindist1[m] <- min(c(
								hubs1[trains1[m,22],8] + 2*trains1[m,21]/1000,
								2*dat$distance_to_berlin[m]/1000
								)
								)
	dat$traindist2[m] <- min(c(
								hubs2[trains2[m,22],8] + 2*trains2[m,21]/1000,
								2*dat$distance_to_berlin[m]/1000
								)
								)

						}

cor(dat$distance_to_berlin, dat$traindist1)
cor(dat$distance_to_berlin, dat$traindist2)

plot(dat$distance_to_berlin / 1000, dat$traindist1)
lines(x = c(0,1000), y = c(0,1000), lwd = 2, col = "red")

plot(dat$distance_to_berlin / 1000, dat$traindist2)
lines(x = c(0,1000), y = c(0,1000), lwd = 2, col = "red")



##  Should municipalities be dropped/recoded to account for possible jamming?
##  jamming = km10, km12, km15; adjust = "drop", "min"
jamming <- ""
adjust <- ""

km10 <- data.frame(read_excel("10_buffer_join.xlsx", 1))
km12 <- data.frame(read_excel("12_buffer_join.xlsx", 1))
km15 <- data.frame(read_excel("15_buffer_join.xlsx", 1))


## drop 10km
if(jamming == "km10" & adjust == "drop")	{

	sel <- unique(km10$id)
	sel <- sel[sel != 0]
	dat <- dat[!is.element(dat$id,sel),]

											}


## drop 12km
if(jamming == "km12" & adjust == "drop")	{

	sel <- unique(km12$id)
	sel <- sel[sel != 0]
	dat <- dat[!is.element(dat$id,sel),]

											}

## drop 15km
if(jamming == "km15" & adjust == "drop")	{

	sel <- unique(km15$id)
	sel <- sel[sel != 0]
	dat <- dat[!is.element(dat$id,sel),]

											}


## set to minimum signal strength 10km
if(jamming == "km10" & adjust == "min")		{

	sel <- unique(km10$id)
	sel <- sel[sel != 0]
	dat$signal[is.element(dat$id,sel)] <- min(dat$signal)

											}


## set to minimum signal strength 12km
if(jamming == "km12" & adjust == "min")		{

	sel <- unique(km12$id)
	sel <- sel[sel != 0]
	dat$signal[is.element(dat$id,sel)] <- min(dat$signal)

											}

## set to minimum signal strength 15km
if(jamming == "km15" & adjust == "min")		{

	sel <- unique(km15$id)
	sel <- sel[sel != 0]
	dat$signal[is.element(dat$id,sel)] <- min(dat$signal)

											}



##  do some data cleaning stuff...

##  drop East Berlin from the sample
sel <- dat$municipality == "Berlin (Ost)"
table(sel)
dat <- dat[!sel,]


dat$protest[is.na(dat$protest)] <- 0
table(dat$protest)
stopifnot(sum(is.na(dat$protest)) == 0)


dat$county_capital[is.na(dat$county_capital)] <- 0
table(dat$county_capital)
stopifnot(sum(is.na(dat$county_capital)) == 0)


## recode size class 12 to 11 to avoid separation problems in probit fits
dat$size[dat$size == 12] <- 11
table(dat$size)



##  (other protest data that we ended up not using because coverage is very limited)
dat$protest_hornisse[is.na(dat$protest_hornisse)] <- 0
table(dat$protest_hornisse)
stopifnot(sum(is.na(dat$protest_hornisse)) == 0)


dat$KVP[is.na(dat$KVP)] <- 0
table(dat$KVP)
stopifnot(sum(is.na(dat$KVP)) == 0)


## distance to Berlin in km
dat$distance_to_berlin <- dat$distance_to_berlin / 1000
range(dat$distance_to_berlin)



##
##  import 1946 covariates spatially weighted to conform to
##  1953 counties
dat.1946.m1 <- data.frame(read_excel("5349imp1allcovs.xls", 1))
dat.1946.m2 <- data.frame(read_excel("5349imp2allcovs.xls", 1))
dat.1946.m3 <- data.frame(read_excel("5349imp3allcovs.xls", 1))
dat.1946.m4 <- data.frame(read_excel("5349imp4allcovs.xls", 1))
dat.1946.m5 <- data.frame(read_excel("5349imp5allcovs.xls", 1))


##  import other covariates that all have geen spatially weighted to conform to
##  1953 counties
dat.1950 <- data.frame(read_excel("1950_1953 covariates.xls", 1))
dat.1952 <- data.frame(read_excel("1952_1953 covariates.xlsx", 1))
dat.1932 <- data.frame(read_excel("1932_1953 covariates.xls", 1))

sel <- which(dat.1932[,3] == "Berlin Ost")
dat.1932 <- dat.1932[-sel,]

sel <- which(dat.1932[,3] == "Berlin West")
dat.1932 <- dat.1932[-sel,]


##  consistency check: county names are identical for all covariate datasets
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1950[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1952[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1946.m1[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1946.m2[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1946.m3[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1946.m4[,4]))
stopifnot(as.character(dat.1932[,3]) == as.character(dat.1946.m5[,4]))


##  merge covariates with municipality dataset
temp <- matrix(NA, nrow(dat), 60)
colnames(temp) <- c(
"turnout.m1",
"invalid.m1",
"SED.m1",
"LDP.m1",
"CDU.m1",
"turnout.m2",
"invalid.m2",
"SED.m2",
"LDP.m2",
"CDU.m2",
"turnout.m3",
"invalid.m3",
"SED.m3",
"LDP.m3",
"CDU.m3",
"turnout.m4",
"invalid.m4",
"SED.m4",
"LDP.m4",
"CDU.m4",
"turnout.m5",
"invalid.m5",
"SED.m5",
"LDP.m5",
"CDU.m5",
"LandForstwirtschaft",
"Bergbau",
"Metallverarbeitung",
"Chemie",
"HolzKunstmassen",
"Verbrauchsgueter",
"Bauwirtschaft",
"Verkehrswesen",
"Handel",
"Dienstleistung",
"Unemployed",
"population",
"prop_male",
"area",
"density",
"pop_change",
"prop_Land_same39",
"prop_educ_8",
"prop_educ_10",
"prop_educ_12",
"prop_educ_12plus",
"prop_educ_Fachschule",
"prop_educ_Hochschule",
"prop_evangelisch",
"prop_katholisch",
"prop_glaubenslos",
"living_space",
"turnout32",
"invalid32",
"NSDAP32",
"SPD32",
"KPD32",
"Z32",
"DNVP32",
"DVP32"
)


m <- 1
for(m in 1:nrow(temp))	{

	sel <- which(as.character(dat.1946.m1$countyIDupdated) == as.character(dat$county[m]))
	if(length(sel) != 1)	{ stop() }

	## 1946 covariates--m1
	temp[m,1] <- dat.1946.m1[sel,"turnout"]
	temp[m,2] <- dat.1946.m1[sel,"invalid"]
	temp[m,3] <- dat.1946.m1[sel,"sed"]
	temp[m,4] <- dat.1946.m1[sel,"ldp"]
	temp[m,5] <- dat.1946.m1[sel,"cdu"]

	## 1946 covariates--m2
	temp[m,6] <- dat.1946.m2[sel,"turnout"]
	temp[m,7] <- dat.1946.m2[sel,"invalid"]
	temp[m,8] <- dat.1946.m2[sel,"sed"]
	temp[m,9] <- dat.1946.m2[sel,"ldp"]
	temp[m,10] <- dat.1946.m2[sel,"cdu"]

	## 1946 covariates--m3
	temp[m,11] <- dat.1946.m3[sel,"turnout"]
	temp[m,12] <- dat.1946.m3[sel,"invalid"]
	temp[m,13] <- dat.1946.m3[sel,"sed"]
	temp[m,14] <- dat.1946.m3[sel,"ldp"]
	temp[m,15] <- dat.1946.m3[sel,"cdu"]

	## 1946 covariates--m4
	temp[m,16] <- dat.1946.m4[sel,"turnout"]
	temp[m,17] <- dat.1946.m4[sel,"invalid"]
	temp[m,18] <- dat.1946.m4[sel,"sed"]
	temp[m,19] <- dat.1946.m4[sel,"ldp"]
	temp[m,20] <- dat.1946.m4[sel,"cdu"]

	## 1946 covariates--m5
	temp[m,21] <- dat.1946.m5[sel,"turnout"]
	temp[m,22] <- dat.1946.m5[sel,"invalid"]
	temp[m,23] <- dat.1946.m5[sel,"sed"]
	temp[m,24] <- dat.1946.m5[sel,"ldp"]
	temp[m,25] <- dat.1946.m5[sel,"cdu"]


	## 1952 covariates
	temp[m,26] <- dat.1952[sel,24]
	temp[m,27] <- dat.1952[sel,30]
	temp[m,28] <- dat.1952[sel,36]
	temp[m,29] <- dat.1952[sel,42]
	temp[m,30] <- dat.1952[sel,48]
	temp[m,31] <- dat.1952[sel,54]
	temp[m,32] <- dat.1952[sel,60]
	temp[m,33] <- dat.1952[sel,66]
	temp[m,34] <- dat.1952[sel,72]
	temp[m,35] <- dat.1952[sel,78]
	temp[m,36] <- dat.1952[sel,84]


	## 1950 covariates
	## use ArcGIS county area instead of approximated area; rho = .997
	temp[m,37] <- dat.1950[sel,41]  ## use pop. figures from 1957 stat yearb for 1950
	temp[m,38] <- dat.1950[sel,12]
	temp[m,39] <- dat.1950[sel,2]	## ArcGIS county area
	temp[m,40] <- dat.1950[sel,16]
	temp[m,41] <- dat.1950[sel,18]
	temp[m,42] <- dat.1950[sel,20]
	temp[m,43] <- dat.1950[sel,22]
	temp[m,44] <- dat.1950[sel,24]
	temp[m,45] <- dat.1950[sel,26]
	temp[m,46] <- dat.1950[sel,28]
	temp[m,47] <- dat.1950[sel,30]
	temp[m,48] <- dat.1950[sel,32]
	temp[m,49] <- dat.1950[sel,34]
	temp[m,50] <- dat.1950[sel,36]
	temp[m,51] <- dat.1950[sel,38]
	temp[m,52] <- dat.1950[sel,40]

	## recompute population density using ArcGIS area
	temp[m,40] <- temp[m,37] / temp[m,39]

	temp[m,53] <- dat.1932[sel,14] / dat.1932[sel,11]
	temp[m,54] <- dat.1932[sel,13] / dat.1932[sel,11]
	temp[m,55] <- dat.1932[sel,15] / dat.1932[sel,11]
	temp[m,56] <- dat.1932[sel,16] / dat.1932[sel,11]
	temp[m,57] <- dat.1932[sel,17] / dat.1932[sel,11]
	temp[m,58] <- dat.1932[sel,18] / dat.1932[sel,11]
	temp[m,59] <- dat.1932[sel,19] / dat.1932[sel,11]
	temp[m,60] <- dat.1932[sel,20] / dat.1932[sel,11]

						}

stopifnot(sum(is.na(temp)) == 0)


##  Compare spatially weighted population figures to 1950 population figures from
##  1957 statistical yearbook
cor(dat.1950[,10], dat.1950[,41])
cor(dat.1950[is.na(dat.1950[,42]),10], dat.1950[is.na(dat.1950[,42]),41])
## omit imputated data to account for county changes between '53 and '58; same result

plot(dat.1950[,10], dat.1950[,41], xlab = "Population size (spatially weighted)",
ylab = "Population size (recomputed)", cex.lab = 1.2, cex.axis = 1.2,
col = "#7F7F7FB2")
lines(x = c(-1000000,1000000), y = c(-1000000,1000000), col = "#7F7F7FB2", lwd = 1.5,
lty = 1)



##  transform sector shares into percentages
total <- apply(temp[,26:36],1,sum)
temp[,26] <- temp[,26] / total
temp[,27] <- temp[,27] / total
temp[,28] <- temp[,28] / total
temp[,29] <- temp[,29] / total
temp[,30] <- temp[,30] / total
temp[,31] <- temp[,31] / total
temp[,32] <- temp[,32] / total
temp[,33] <- temp[,33] / total
temp[,34] <- temp[,34] / total
temp[,35] <- temp[,35] / total
temp[,36] <- temp[,36] / total

dat <- cbind(dat,temp)



##  Create covariate table
covar <- matrix(NA,51,5)
colnames(covar) <- c("mean", "sd", "min", "max", "source")
rownames(covar) <- c(
"size class 4 (1k--1.5k)",
"size class 5 (1.5k--2k)",
"size class 6 (2k--3k)",
"size class 7 (3k--5k)",
"size class 8 (5k--10k)",
"size class 9 (10k--20k)",
"size class 10 (20k--50k)",
"size class 11 (50k+)",
"county capital",
"distance to East Berlin (km)",
"KVP base",
"% turnout 1946 elections",
"% invalid votes 1946 elections",
"% SED 1946 elections",
"% LDP 1946 elections",
"% CDU 1946 elections",
"% agriculture and forestry",
"% mining",
"% metalworking industry",
"% chemical industry",
"% woods and plastics",
"% consumer goods",
"% construction",
"% transportation",
"% commerce",
"% services",
"% unemployed",
"population size",
"% population male",
"population density",
"% population change 1939--50",
"% population in same {\\it Land} as in 1939",
"% 8 years of education",
"% 10 years of education",
"% 12 years of education",
"% more than 12 years of education",
"% trade school degree",
"% university degree",
"% protestant",
"% catholic",
"% agnostic",
"living space per capita (m$^2$)",
"% turnout 1932 elections",
"% invalid votes 1932 elections",
"% NSDAP 1932 elections",
"% SPD 1932 elections",
"% KPD 1932 elections",
"% Zentrum 1932 elections",
"% DNVP 1932 elections",
"% DVP 1932 elections",
"train distance to East Berlin (km)"
)


## helper function to compute mean, sd, and range of data
su <- function(temp)	{ return(c(mean(temp),sd(temp),min(temp),max(temp))) }


## municipality-level covariates
covar[1:8,1] <- table(dat$size) / nrow(dat)
covar[9,1] <- mean(dat$county_capital)
covar[10,1:4] <- su(dat$distance_to_berlin)
covar[11,1] <- mean(dat$KVP)


## county-level covariates, averaged over imputations
covar[12,1:4] <- su(c(	dat.1946.m1$turnout,
						dat.1946.m2$turnout,
						dat.1946.m3$turnout,
						dat.1946.m4$turnout,
						dat.1946.m5$turnout))

covar[13,1:4] <- su(c(	dat.1946.m1$invalid,
						dat.1946.m2$invalid,
						dat.1946.m3$invalid,
						dat.1946.m4$invalid,
						dat.1946.m5$invalid))

covar[14,1:4] <- su(c(	dat.1946.m1$sed,
						dat.1946.m2$sed,
						dat.1946.m3$sed,
						dat.1946.m4$sed,
						dat.1946.m5$sed))

covar[15,1:4] <- su(c(	dat.1946.m1$ldp,
						dat.1946.m2$ldp,
						dat.1946.m3$ldp,
						dat.1946.m4$ldp,
						dat.1946.m5$ldp))

covar[16,1:4] <- su(c(	dat.1946.m1$cdu,
						dat.1946.m2$cdu,
						dat.1946.m3$cdu,
						dat.1946.m4$cdu,
						dat.1946.m5$cdu))


total <- apply(dat.1952[, seq(from = 24, to = 84, by = 6)], 1, sum)

covar[17,1:4] <- su(dat.1952[,24]/total)
covar[18,1:4] <- su(dat.1952[,30]/total)
covar[19,1:4] <- su(dat.1952[,36]/total)
covar[20,1:4] <- su(dat.1952[,42]/total)
covar[21,1:4] <- su(dat.1952[,48]/total)
covar[22,1:4] <- su(dat.1952[,54]/total)
covar[23,1:4] <- su(dat.1952[,60]/total)
covar[24,1:4] <- su(dat.1952[,66]/total)
covar[25,1:4] <- su(dat.1952[,72]/total)
covar[26,1:4] <- su(dat.1952[,78]/total)
covar[27,1:4] <- su(dat.1952[,84]/total)


covar[28,1:4] <- su(dat.1950[,41])
covar[29,1:4] <- su(dat.1950[,12])
covar[30,1:4] <- su(dat.1950[,41] / dat.1950[,2])
covar[31,1:4] <- su(dat.1950[,18])
covar[32,1:4] <- su(dat.1950[,20])
covar[33,1:4] <- su(dat.1950[,22])
covar[34,1:4] <- su(dat.1950[,24])
covar[35,1:4] <- su(dat.1950[,26])
covar[36,1:4] <- su(dat.1950[,28])
covar[37,1:4] <- su(dat.1950[,30])
covar[38,1:4] <- su(dat.1950[,32])
covar[39,1:4] <- su(dat.1950[,34])
covar[40,1:4] <- su(dat.1950[,36])
covar[41,1:4] <- su(dat.1950[,38])
covar[42,1:4] <- su(dat.1950[,40])


covar[43,1:4] <- su(dat.1932[,14] / dat.1932[,11])
covar[44,1:4] <- su(dat.1932[,13] / dat.1932[,11])
covar[45,1:4] <- su(dat.1932[,15] / dat.1932[,11])
covar[46,1:4] <- su(dat.1932[,16] / dat.1932[,11])
covar[47,1:4] <- su(dat.1932[,17] / dat.1932[,11])
covar[48,1:4] <- su(dat.1932[,18] / dat.1932[,11])
covar[49,1:4] <- su(dat.1932[,19] / dat.1932[,11])
covar[50,1:4] <- su(dat.1932[,20] / dat.1932[,11])

covar[51,1:4] <- su(dat$traindist1)

covar2 <- covar


##  helper function to format covariates table
form <- function(temp, digits = 3)	{

		out <- rep(NA,length(temp))
		for(m in 1:length(temp))		{
		
			out[m] <- paste("$", format(round(as.numeric(temp[m]), digits = digits),
     		scientific = FALSE, nsmall = digits), "$", sep = "")
			if(out[m] == "$NA$") out[m] <- ""

										}
		return(out)

									}

covar2[,1] <- form(covar2[,1])
covar2[,2] <- form(covar2[,2])
covar2[,3] <- form(covar2[,3])
covar2[,4] <- form(covar2[,4])


##  add sources
covar2[1:10,5] <- "Krupkat 1958"
covar2[10,5] <- "ArcGIS"
covar2[11,5] <- "Diedrich and Wenzke 2003: 58"
covar2[12:16,5] <- "Braun 1993"
covar2[17:41,5] <- "August 1950 census"
covar2[42,5] <- "June 1950 census"
covar2[43:50,5] <- "Falter and H\\\"anisch 1990"
covar2[51,5] <- "Deutsche Reichsbahn 1953"

latex(covar2, file = "")


##  create histogram of municipality sizes with % with protests.
barplot(t(table(dat$size, dat$protest)), xlab = "Municipality size class",
ylab = "Number of municipalities", cex.axis = 1.2, cex.lab = 1.2,
col = c("blue4","chartreuse4"), ylim = c(0,1000), border = NA,
legend.text = c("no protest","protest"), args.legend = list(border = NA, bty = "n"))

table(dat$size, dat$protest)[,2] / table(dat$size) 



rm(		sel,temp,dat.1932,dat.1946.m1,dat.1946.m2,dat.1946.m3,dat.1946.m4,dat.1946.m5,
		dat.1950,dat.1952,m,total,km10,km12,km15,trains1,trains2,hubs1,hubs2)

save.image(paste("1953data", signal, adjust, jamming, ".RData", sep = ""))













